{
printk("Guest: events = %08lx, events_mask = %08lx\n",
s->events, s->events_mask);
-
- if ( (v = find_vif_by_id((p->domain)<<VIF_DOMAIN_SHIFT)) != NULL )
- {
- printk("rx_prod=%d ,rx_cons=%d, tx_prod=%d, tx_cons=%d\n",
- v->rx_prod, v->rx_cons, v->tx_prod, v->tx_cons );
- printk("rx_req_cons=%d, rx_resp_prod=%d, "
- "tx_req_cons=%d, tx_resp_prod=%d\n",
- v->rx_req_cons, v->rx_resp_prod,
- v->tx_req_cons, v->tx_resp_prod);
- put_vif(v);
- }
printk("Notifying guest...\n");
set_bit(_EVENT_DEBUG, &s->events);
}
-/*
+/******************************************************************************
* xen_block.c
*
* process incoming block io requests from guestos's.
#include <xeno/slab.h>
/*
- * These are rather arbitrary. They are fairly large because adjacent
- * requests pulled from a communication ring are quite likely to end
- * up being part of the same scatter/gather request at the disc.
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
*
* ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
* This will increase the chances of being able to write whole tracks.
- * '64' should be enough to keep us competitive with Linux.
+ * 64 should be enough to keep us competitive with Linux.
*/
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16
/*
- * Each outstanding request which we've passed to the lower device layers
- * has a 'pending_req' allocated to it. Each buffer_head that completes
- * decrements the pendcnt towards zero. When it hits zero, the specified
- * domain has a response queued for it, with the saved 'id' passed back.
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
*
- * We can't allocate pending_req's in order, since they may complete out
- * of order. We therefore maintain an allocation ring. This ring also
- * indicates when enough work has been passed down -- at that point the
- * allocation ring will be empty.
+ * We can't allocate pending_req's in order, since they may complete out of
+ * order. We therefore maintain an allocation ring. This ring also indicates
+ * when enough work has been passed down -- at that point the allocation ring
+ * will be empty.
*/
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
-static unsigned int pending_prod, pending_cons;
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-#define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1))
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
static kmem_cache_t *buffer_head_cachep;
-static atomic_t nr_pending;
static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
static void io_schedule(unsigned long unused);
static int do_block_io_op_domain(struct task_struct *p, int max_to_do);
-static void dispatch_rw_block_io(struct task_struct *p, int index);
-static void dispatch_debug_block_io(struct task_struct *p, int index);
+static void dispatch_rw_block_io(struct task_struct *p,
+ blk_ring_req_entry_t *req);
static void make_response(struct task_struct *p, unsigned long id,
unsigned short op, unsigned long st);
struct list_head *ent;
/* Queue up a batch of requests. */
- while ( (atomic_read(&nr_pending) < MAX_PENDING_REQS) &&
+ while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
!list_empty(&io_schedule_list) )
{
ent = io_schedule_list.next;
*/
smp_mb();
- if ( (atomic_read(&nr_pending) < (MAX_PENDING_REQS/2)) &&
+ if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
!list_empty(&io_schedule_list) )
- {
tasklet_schedule(&io_schedule_tasklet);
- }
}
pending_req->operation, pending_req->status);
put_task_struct(pending_req->domain);
spin_lock(&pend_prod_lock);
- pending_ring[pending_prod] = pending_req - pending_reqs;
- PENDREQ_IDX_INC(pending_prod);
+ pending_ring[MASK_PEND_IDX(pending_prod)] =
+ pending_req - pending_reqs;
+ pending_prod++;
spin_unlock(&pend_prod_lock);
- atomic_dec(&nr_pending);
maybe_trigger_io_schedule();
}
block_io_op_t op;
struct task_struct *p = current;
- if ( copy_from_user(&op, u_block_io_op, sizeof(op)) )
+ if ( unlikely(copy_from_user(&op, u_block_io_op, sizeof(op)) != 0) )
return -EFAULT;
switch ( op.cmd )
case BLOCK_IO_OP_VBD_PROBE:
/* query VBD information for self or others (or all) */
- ret = vbd_probe(&op.u.probe_params);
- if(ret == 0)
+ if ( (ret = vbd_probe(&op.u.probe_params)) == 0 )
copy_to_user(u_block_io_op, &op, sizeof(op));
break;
case BLOCK_IO_OP_VBD_INFO:
/* query information about a particular VBD */
- ret = vbd_info(&op.u.info_params);
- if(ret == 0)
+ if ( (ret = vbd_info(&op.u.info_params)) == 0 )
copy_to_user(u_block_io_op, &op, sizeof(op));
break;
-
+
default:
ret = -ENOSYS;
}
static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
{
blk_ring_t *blk_ring = p->blk_ring_base;
- int i, more_to_do = 0;
+ blk_ring_req_entry_t *req;
+ BLK_RING_IDX i;
+ int more_to_do = 0;
- /*
- * Take items off the comms ring, taking care not to catch up
- * with the response-producer index.
- */
+ /* Take items off the comms ring, taking care not to overflow. */
for ( i = p->blk_req_cons;
- (i != blk_ring->req_prod) &&
- (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1);
- i = BLK_RING_INC(i) )
+ (i != blk_ring->req_prod) && ((i-p->blk_resp_prod) != BLK_RING_SIZE);
+ i++ )
{
- if ( (max_to_do-- == 0) ||
- (atomic_read(&nr_pending) == MAX_PENDING_REQS) )
+ if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
{
more_to_do = 1;
break;
}
- switch ( blk_ring->ring[i].req.operation )
+ req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
+ switch ( req->operation )
{
case XEN_BLOCK_READ:
case XEN_BLOCK_WRITE:
- dispatch_rw_block_io(p, i);
- break;
-
- case XEN_BLOCK_DEBUG:
- dispatch_debug_block_io(p, i);
+ dispatch_rw_block_io(p, req);
break;
default:
return more_to_do;
}
-static void dispatch_debug_block_io(struct task_struct *p, int index)
-{
- DPRINTK("dispatch_debug_block_io: unimplemented\n");
-}
-
-static void dispatch_rw_block_io(struct task_struct *p, int index)
+static void dispatch_rw_block_io(struct task_struct *p,
+ blk_ring_req_entry_t *req)
{
extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
- blk_ring_t *blk_ring = p->blk_ring_base;
- blk_ring_req_entry_t *req = &blk_ring->ring[index].req;
struct buffer_head *bh;
int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
unsigned short nr_sects;
}
/*
- * XXX Clear any 'partition' info in device. This works because IDE
- * ignores the partition bits anyway. Only SCSI needs this hack,
- * and it has four bits to clear.
+ * Clear any 'partition' bits in the device id. This works because
+ * IDE ignores the partition bits anyway. Only SCSI needs this
+ * hack, and we know that always requires the four LSBs cleared.
*/
phys_seg[nr_psegs].dev = req->device & 0xFFF0;
new_segs = 1;
}
}
- atomic_inc(&nr_pending);
- pending_req = pending_reqs + pending_ring[pending_cons];
- PENDREQ_IDX_INC(pending_cons);
+ pending_req = &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]];
pending_req->domain = p;
pending_req->id = req->id;
pending_req->operation = operation;
unsigned short op, unsigned long st)
{
unsigned long cpu_mask;
- int position;
- blk_ring_t *blk_ring;
+ blk_ring_resp_entry_t *resp;
/* Place on the response ring for the relevant domain. */
spin_lock(&p->blk_ring_lock);
- blk_ring = p->blk_ring_base;
- position = p->blk_resp_prod;
- blk_ring->ring[position].resp.id = id;
- blk_ring->ring[position].resp.operation = op;
- blk_ring->ring[position].resp.status = st;
- p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
+ resp = &p->blk_ring_base->ring[MASK_BLK_IDX(p->blk_resp_prod)].resp;
+ resp->id = id;
+ resp->operation = op;
+ resp->status = st;
+ wmb();
+ p->blk_ring_base->resp_prod = ++p->blk_resp_prod;
spin_unlock(&p->blk_ring_lock);
/* Kick the relevant domain. */
{
unsigned long flags;
struct task_struct *p;
- blk_ring_t *blk_ring ;
+ blk_ring_t *blk_ring;
int i;
- printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n",
- atomic_read(&nr_pending), pending_prod, pending_cons);
+ printk("Dumping block queue stats: nr_pending = %d"
+ " (prod=0x%08x,cons=0x%08x)\n",
+ NR_PENDING_REQS, pending_prod, pending_cons);
read_lock_irqsave(&tasklist_lock, flags);
p = &idle0_task;
printk("Domain: %d\n", p->domain);
blk_ring = p->blk_ring_base;
- printk(" req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n",
+ printk(" req_prod:0x%08x, req_cons:0x%08x resp_prod:0x%08x/"
+ "0x%08x on_list=%d\n",
blk_ring->req_prod, p->blk_req_cons,
blk_ring->resp_prod, p->blk_resp_prod,
__on_blkdev_list(p));
/* Start-of-day initialisation for a new domain. */
void init_blkdev_info(struct task_struct *p)
{
- if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
+ if ( unlikely(sizeof(*p->blk_ring_base) > PAGE_SIZE) )
+ BUG();
+
p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
clear_page(p->blk_ring_base);
SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
{
int i;
- atomic_set(&nr_pending, 0);
- pending_prod = pending_cons = 0;
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
memset(pending_reqs, 0, sizeof(pending_reqs));
for ( i = 0; i < MAX_PENDING_REQS; i++ )
pending_ring[i] = i;
/* NB. Ring size must be small enough for sizeof(blk_ring_t) <= PAGE_SIZE. */
#define BLK_RING_SIZE 64
-#define BLK_RING_INC(_i) (((_i)+1) & (BLK_RING_SIZE-1))
/*
* Maximum scatter/gather segments per request.
    unsigned long status; /* currently boolean good/bad */
} blk_ring_resp_entry_t;
+/*
+ * We use a special capitalised type name because it is _essential_ that all
+ * arithmetic on indexes is done on an integer type of the correct size.
+ */
+typedef unsigned int BLK_RING_IDX;
+
+/*
+ * Ring indexes are 'free running'. That is, they are not stored modulo the
+ * size of the ring buffer. The following macro converts a free-running counter
+ * into a value that can directly index a ring-buffer array.
+ */
+#define MASK_BLK_IDX(_i) ((_i)&(BLK_RING_SIZE-1))
+
typedef struct blk_ring_st
{
- unsigned int req_prod; /* Request producer. Updated by guest OS. */
- unsigned int resp_prod; /* Response producer. Updated by Xen. */
+ BLK_RING_IDX req_prod; /* Request producer. Updated by guest OS. */
+ BLK_RING_IDX resp_prod; /* Response producer. Updated by Xen. */
union {
blk_ring_req_entry_t req;
blk_ring_resp_entry_t resp;
rx_entry_t rx_ring[RX_RING_SIZE];
} net_ring_t;
+/*
+ * We use a special capitalised type name because it is _essential_ that all
+ * arithmetic on indexes is done on an integer type of the correct size.
+ */
+typedef unsigned int NET_RING_IDX;
+
+/*
+ * Ring indexes are 'free running'. That is, they are not stored modulo the
+ * size of the ring buffer. The following macros convert a free-running counter
+ * into a value that can directly index a ring-buffer array.
+ */
+#define MASK_NET_RX_IDX(_i) ((_i)&(RX_RING_SIZE-1))
+#define MASK_NET_TX_IDX(_i) ((_i)&(TX_RING_SIZE-1))
+
typedef struct net_idx_st
{
/*
* Guest OS places empty buffers into ring at rx_req_prod.
 * Guest OS receives EVENT_NET when rx_resp_prod passes rx_event.
*/
- unsigned int tx_req_prod, tx_resp_prod, tx_event;
- unsigned int rx_req_prod, rx_resp_prod, rx_event;
+ NET_RING_IDX tx_req_prod, tx_resp_prod, tx_event;
+ NET_RING_IDX rx_req_prod, rx_resp_prod, rx_event;
} net_idx_t;
/*
/* Block I/O */
blk_ring_t *blk_ring_base;
- unsigned int blk_req_cons; /* request consumer */
- unsigned int blk_resp_prod; /* (private version of) response producer */
+ BLK_RING_IDX blk_req_cons; /* request consumer */
+ BLK_RING_IDX blk_resp_prod; /* (private version of) response producer */
struct list_head blkdev_list;
spinlock_t blk_ring_lock;
vbd_t *vbdtab[VBD_HTAB_SZ]; /* mapping from 16-bit vdevices to vbds */
/* The private rings and indexes. */
rx_shadow_entry_t rx_shadow_ring[RX_RING_SIZE];
- unsigned int rx_prod; /* More buffers for filling go here. */
- unsigned int rx_cons; /* Next buffer to fill is here. */
+ NET_RING_IDX rx_prod; /* More buffers for filling go here. */
+ NET_RING_IDX rx_cons; /* Next buffer to fill is here. */
tx_shadow_entry_t tx_shadow_ring[TX_RING_SIZE];
- unsigned int tx_prod; /* More packets for sending go here. */
- unsigned int tx_cons; /* Next packet to send is here. */
+ NET_RING_IDX tx_prod; /* More packets for sending go here. */
+ NET_RING_IDX tx_cons; /* Next packet to send is here. */
/* Private indexes into shared ring. */
- unsigned int rx_req_cons;
- unsigned int rx_resp_prod; /* private version of shared variable */
- unsigned int tx_req_cons;
- unsigned int tx_resp_prod; /* private version of shared variable */
+ NET_RING_IDX rx_req_cons;
+ NET_RING_IDX rx_resp_prod; /* private version of shared variable */
+ NET_RING_IDX tx_req_cons;
+ NET_RING_IDX tx_resp_prod; /* private version of shared variable */
/* Usage accounting */
long long total_bytes_sent;
#define rtnl_lock() ((void)0)
#define rtnl_unlock() ((void)0)
-#define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
-#define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
-#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
-#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
-
struct skb_completion_queues skb_queue[NR_CPUS] __cacheline_aligned;
static int get_tx_bufs(net_vif_t *vif);
rx_shadow_entry_t *rx;
unsigned long *ptep, pte;
struct pfn_info *old_page, *new_page, *pte_page;
- unsigned int i;
unsigned short size;
unsigned char offset, status = RING_STATUS_OK;
struct task_struct *p = vif->domain;
spin_lock(&vif->rx_lock);
- if ( (i = vif->rx_cons) == vif->rx_prod )
+ if ( unlikely(vif->rx_cons == vif->rx_prod) )
{
spin_unlock(&vif->rx_lock);
perfc_incr(net_rx_capacity_drop);
return;
}
- rx = &vif->rx_shadow_ring[i];
- vif->rx_cons = RX_RING_INC(i);
+ rx = &vif->rx_shadow_ring[MASK_NET_RX_IDX(vif->rx_cons++)];
size = (unsigned short)skb->len;
offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
add_to_net_schedule_list_tail(vif);
- if ( (skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL )
+ if ( unlikely((skb = alloc_skb_nodata(GFP_ATOMIC)) == NULL) )
{
printk("Out of memory in net_tx_action()!\n");
add_to_net_schedule_list_tail(vif);
}
/* Pick an entry from the transmit queue. */
- tx = &vif->tx_shadow_ring[vif->tx_cons];
- vif->tx_cons = TX_RING_INC(vif->tx_cons);
+ tx = &vif->tx_shadow_ring[MASK_NET_TX_IDX(vif->tx_cons++)];
skb->destructor = tx_skb_release;
vif->total_bytes_sent += tx->size;
/* Is the NIC crap? */
- if ( !(dev->features & NETIF_F_SG) )
+ if ( unlikely(!(dev->features & NETIF_F_SG)) )
{
nskb = skb_copy(skb, GFP_KERNEL);
kfree_skb(skb);
}
/* Transmit should always work, or the queue would be stopped. */
- if ( dev->hard_start_xmit(skb, dev) != 0 )
+ if ( unlikely(dev->hard_start_xmit(skb, dev) != 0) )
{
printk("Weird failure in hard_start_xmit!\n");
kfree_skb(skb);
unsigned short protocol;
struct sk_buff *skb;
tx_req_entry_t tx;
- int i, j, ret = 0;
+ tx_shadow_entry_t *stx;
+ NET_RING_IDX i, j;
+ int ret = 0;
if ( vif->tx_req_cons == shared_idxs->tx_req_prod )
return 0;
again:
for ( i = vif->tx_req_cons;
(i != shared_idxs->tx_req_prod) &&
- (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
- i = TX_RING_INC(i) )
+ ((i-vif->tx_resp_prod) != TX_RING_SIZE);
+ i++ )
{
- tx = shared_rings->tx_ring[i].req;
+ tx = shared_rings->tx_ring[MASK_NET_TX_IDX(i)].req;
target = VIF_DROP;
- if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
+ if ( unlikely(tx.size <= PKT_PROT_LEN) ||
+ unlikely(tx.size > ETH_FRAME_LEN) )
{
DPRINTK("Bad packet size: %d\n", tx.size);
make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
if ( VIF_LOCAL(target) )
{
/* Local delivery */
- if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
+ if ( unlikely((skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL) )
{
make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
put_vif(target);
}
else if ( (target == VIF_PHYS) || IS_PRIV(p) )
{
- vif->tx_shadow_ring[j].id = tx.id;
- vif->tx_shadow_ring[j].size = tx.size;
- vif->tx_shadow_ring[j].header =
- kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
- if ( vif->tx_shadow_ring[j].header == NULL )
+ stx = &vif->tx_shadow_ring[MASK_NET_TX_IDX(j)];
+ stx->id = tx.id;
+ stx->size = tx.size;
+ stx->header = kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
+ if ( unlikely(stx->header == NULL) )
{
make_tx_response(vif, tx.id, RING_STATUS_OK);
goto cleanup_and_continue;
}
- memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
- vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
+ memcpy(stx->header, g_data, PKT_PROT_LEN);
+ stx->payload = tx.addr + PKT_PROT_LEN;
+
+ j++;
buf_page = NULL; /* hand off our page reference */
- j = TX_RING_INC(j);
}
else
{
struct task_struct *p = vif->domain;
net_ring_t *shared_rings = vif->shared_rings;
net_idx_t *shared_idxs = vif->shared_idxs;
- unsigned int i, j;
+ NET_RING_IDX i, j;
rx_req_entry_t rx;
+ rx_shadow_entry_t *srx;
unsigned long pte_pfn, buf_pfn;
struct pfn_info *pte_page, *buf_page;
unsigned long *ptep, pte;
j = vif->rx_prod;
for ( i = vif->rx_req_cons;
(i != shared_idxs->rx_req_prod) &&
- (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
- i = RX_RING_INC(i) )
+ ((i-vif->rx_resp_prod) != RX_RING_SIZE);
+ i++ )
{
- rx = shared_rings->rx_ring[i].req;
+ rx = shared_rings->rx_ring[MASK_NET_RX_IDX(i)].req;
pte_pfn = rx.addr >> PAGE_SHIFT;
pte_page = &frame_table[pte_pfn];
list_del(&buf_page->list);
spin_unlock(&p->page_list_lock);
- vif->rx_shadow_ring[j].id = rx.id;
- vif->rx_shadow_ring[j].pte_ptr = rx.addr;
- vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
- j = RX_RING_INC(j);
+ srx = &vif->rx_shadow_ring[MASK_NET_RX_IDX(j++)];
+ srx->id = rx.id;
+ srx->pte_ptr = rx.addr;
+ srx->buf_pfn = buf_pfn;
rx_unmap_and_continue:
unmap_domain_mem(ptep);
long flush_bufs_for_vif(net_vif_t *vif)
{
- int i;
+ NET_RING_IDX i;
unsigned long *ptep, pte;
struct pfn_info *page;
struct task_struct *p = vif->domain;
/* Return any outstanding receive buffers to the guest OS. */
spin_lock(&vif->rx_lock);
for ( i = vif->rx_req_cons;
- (i != shared_idxs->rx_req_prod) &&
- (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
- i = RX_RING_INC(i) )
+ (i != shared_idxs->rx_req_prod) &&
+ ((i-vif->rx_resp_prod) != RX_RING_SIZE);
+          i++ )
{
- make_rx_response(vif, shared_rings->rx_ring[i].req.id, 0,
- RING_STATUS_DROPPED, 0);
+ make_rx_response(vif, shared_rings->rx_ring[MASK_NET_RX_IDX(i)].req.id,
+ 0, RING_STATUS_DROPPED, 0);
}
vif->rx_req_cons = i;
- for ( i = vif->rx_cons; i != vif->rx_prod; i = RX_RING_INC(i) )
+ for ( i = vif->rx_cons; i != vif->rx_prod; i++ )
{
- rx = &vif->rx_shadow_ring[i];
+ rx = &vif->rx_shadow_ring[MASK_NET_RX_IDX(i)];
/* Give the buffer page back to the domain. */
page = &frame_table[rx->buf_pfn];
*/
spin_lock(&vif->tx_lock);
for ( i = vif->tx_req_cons;
- (i != shared_idxs->tx_req_prod) &&
- (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
- i = TX_RING_INC(i) )
+ (i != shared_idxs->tx_req_prod) &&
+ ((i-vif->tx_resp_prod) != TX_RING_SIZE);
+ i++ )
{
- make_tx_response(vif, shared_rings->tx_ring[i].req.id,
- RING_STATUS_DROPPED);
+ make_tx_response(vif, shared_rings->tx_ring[MASK_NET_TX_IDX(i)].req.id,
+ RING_STATUS_DROPPED);
}
vif->tx_req_cons = i;
spin_unlock(&vif->tx_lock);
perfc_incr(net_hypercalls);
- if ( copy_from_user(&op, uop, sizeof(op)) )
+ if ( unlikely(copy_from_user(&op, uop, sizeof(op)) != 0) )
return -EFAULT;
- if ( (op.vif >= MAX_DOMAIN_VIFS) ||
- ((vif = current->net_vif_list[op.vif]) == NULL) )
+ if ( unlikely(op.vif >= MAX_DOMAIN_VIFS) ||
+ unlikely((vif = current->net_vif_list[op.vif]) == NULL) )
return -EINVAL;
switch ( op.cmd )
unsigned short id,
unsigned char st)
{
- unsigned int pos;
+ NET_RING_IDX i = vif->tx_resp_prod;
tx_resp_entry_t *resp;
- /* Place on the response ring for the relevant domain. */
- pos = vif->tx_resp_prod;
- resp = &vif->shared_rings->tx_ring[pos].resp;
+ resp = &vif->shared_rings->tx_ring[MASK_NET_TX_IDX(i)].resp;
resp->id = id;
resp->status = st;
- pos = TX_RING_INC(pos);
- vif->tx_resp_prod = vif->shared_idxs->tx_resp_prod = pos;
+ wmb();
+ vif->shared_idxs->tx_resp_prod = vif->tx_resp_prod = ++i;
+
smp_mb(); /* Update producer before checking event threshold. */
- if ( pos == vif->shared_idxs->tx_event )
+ if ( i == vif->shared_idxs->tx_event )
{
unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
guest_event_notify(cpu_mask);
unsigned char st,
unsigned char off)
{
- unsigned int pos;
+ NET_RING_IDX i = vif->rx_resp_prod;
rx_resp_entry_t *resp;
- /* Place on the response ring for the relevant domain. */
- pos = vif->rx_resp_prod;
- resp = &vif->shared_rings->rx_ring[pos].resp;
+ resp = &vif->shared_rings->rx_ring[MASK_NET_RX_IDX(i)].resp;
resp->id = id;
resp->size = size;
resp->status = st;
resp->offset = off;
- pos = RX_RING_INC(pos);
- vif->rx_resp_prod = vif->shared_idxs->rx_resp_prod = pos;
+ wmb();
+ vif->shared_idxs->rx_resp_prod = vif->rx_resp_prod = ++i;
+
smp_mb(); /* Update producer before checking event threshold. */
- if ( pos == vif->shared_idxs->rx_event )
+ if ( i == vif->shared_idxs->rx_event )
{
unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
guest_event_notify(cpu_mask);
static unsigned int state = STATE_SUSPENDED;
static blk_ring_t *blk_ring;
-static unsigned int resp_cons; /* Response consumer for comms ring. */
-static unsigned int req_prod; /* Private request producer. */
+static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */
+static BLK_RING_IDX req_prod; /* Private request producer. */
#define XDI_MAX 64
static xen_disk_info_t xlblk_disk_info; /* information about our disks/VBDs */
/* We plug the I/O ring if the driver is suspended or if the ring is full. */
-#define RING_PLUGGED ((BLK_RING_INC(req_prod) == resp_cons) || \
+#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \
(state != STATE_ACTIVE))
/*
blk_ring_req_entry_t *req;
struct buffer_head *bh;
- if ( nr_sectors >= (1<<9) ) BUG();
- if ( (buffer_ma & ((1<<9)-1)) != 0 ) BUG();
+ if ( unlikely(nr_sectors >= (1<<9)) )
+ BUG();
+ if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
+ BUG();
- if ( state == STATE_CLOSED )
+ if ( unlikely(state == STATE_CLOSED) )
return 1;
switch ( operation )
case XEN_BLOCK_WRITE:
gd = get_gendisk(device);
- /* Update the sector_number we'll pass down as appropriate; note
- that we could sanity check that resulting sector will be in
- this partition, but this will happen in xen anyhow */
+ /*
+ * Update the sector_number we'll pass down as appropriate; note that
+ * we could sanity check that resulting sector will be in this
+ * partition, but this will happen in xen anyhow.
+ */
sector_number += gd->part[MINOR(device)].start_sect;
if ( (sg_operation == operation) &&
(sg_dev == device) &&
(sg_next_sect == sector_number) )
{
- req = &blk_ring->ring[(req_prod-1)&(BLK_RING_SIZE-1)].req;
+ req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req;
bh = (struct buffer_head *)id;
bh->b_reqnext = (struct buffer_head *)req->id;
req->id = id;
}
/* Fill out a communications ring structure. */
- req = &blk_ring->ring[req_prod].req;
+ req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req;
req->id = id;
req->operation = operation;
req->sector_number = sector_number;
req->device = device;
req->nr_segments = 1;
req->buffer_and_sects[0] = buffer_ma | nr_sectors;
- req_prod = BLK_RING_INC(req_prod);
+ req_prod++;
return 0;
}
req->current_nr_sectors, req->nr_sectors, req->bh);
rw = req->cmd;
- if ( rw == READA ) rw = READ;
- if ((rw != READ) && (rw != WRITE))
+ if ( rw == READA )
+ rw = READ;
+ if ( unlikely((rw != READ) && (rw != WRITE)) )
panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
req->errors = 0;
(rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE,
bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
- if(full) {
-
+ if ( full )
+ {
bh->b_reqnext = next_bh;
pending_queues[nr_pending++] = rq;
- if ( nr_pending >= MAX_PENDING ) BUG();
+ if ( unlikely(nr_pending >= MAX_PENDING) )
+ BUG();
goto out;
-
}
queued++;
else
{
/* That was the last buffer head. Finalise the request. */
- if ( end_that_request_first(req, 1, "XenBlk") ) BUG();
+ if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
+ BUG();
blkdev_dequeue_request(req);
end_that_request_last(req);
}
{
/* We kick pending request queues if the ring is reasonably empty. */
if ( (nr_pending != 0) &&
- (((req_prod - resp_cons) & (BLK_RING_SIZE - 1)) <
- (BLK_RING_SIZE >> 1)) )
+ ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) )
{
/* Attempt to drain the queue, but bail if the ring becomes full. */
- while ( nr_pending != 0 )
- {
+ while ( (nr_pending != 0) && !RING_PLUGGED )
do_xlblk_request(pending_queues[--nr_pending]);
- if ( RING_PLUGGED ) break;
- }
}
}
static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
- int i;
+ BLK_RING_IDX i;
unsigned long flags;
struct buffer_head *bh, *next_bh;
- if ( state == STATE_CLOSED )
+ if ( unlikely(state == STATE_CLOSED) )
return;
spin_lock_irqsave(&io_request_lock, flags);
- for ( i = resp_cons;
- i != blk_ring->resp_prod;
- i = BLK_RING_INC(i) )
+ for ( i = resp_cons; i != blk_ring->resp_prod; i++ )
{
- blk_ring_resp_entry_t *bret = &blk_ring->ring[i].resp;
- switch (bret->operation)
+ blk_ring_resp_entry_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp;
+ switch ( bret->operation )
{
case XEN_BLOCK_READ:
case XEN_BLOCK_WRITE:
- if ( bret->status )
+ if ( unlikely(bret->status != 0) )
DPRINTK("Bad return from blkdev data request: %lx\n",
bret->status);
for ( bh = (struct buffer_head *)bret->id;
#define NET_IRQ _EVENT_NET
-#define TX_MAX_ENTRIES (TX_RING_SIZE - 2)
-#define RX_MAX_ENTRIES (RX_RING_SIZE - 2)
-
-#define TX_RING_INC(_i) (((_i)+1) & (TX_RING_SIZE-1))
-#define RX_RING_INC(_i) (((_i)+1) & (RX_RING_SIZE-1))
-#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
-#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
-
#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs);
struct net_device *dev;
struct net_device_stats stats;
- atomic_t tx_entries;
- unsigned int rx_resp_cons, tx_resp_cons, tx_full;
- unsigned int net_ring_fixmap_idx;
- net_ring_t *net_ring;
- net_idx_t *net_idx;
- spinlock_t tx_lock;
+ NET_RING_IDX rx_resp_cons, tx_resp_cons;
+ unsigned int net_ring_fixmap_idx, tx_full;
+ net_ring_t *net_ring;
+ net_idx_t *net_idx;
+ spinlock_t tx_lock;
unsigned int idx; /* Domain-specific index of this VIF. */
unsigned int rx_bufs_to_notify;
#define GET_ID_FROM_FREELIST(_list) \
({ unsigned long _id = (unsigned long)(_list)[0]; \
(_list)[0] = (_list)[_id]; \
- _id; })
+ (unsigned short)_id; })
static void _dbg_network_int(struct net_device *dev)
if ( np->state == STATE_CLOSED )
return;
- printk(KERN_ALERT "tx_full = %d, tx_entries = %d, tx_resp_cons = %d,"
- " tx_req_prod = %d, tx_resp_prod = %d, tx_event = %d, state=%d\n",
- np->tx_full, atomic_read(&np->tx_entries), np->tx_resp_cons,
+ printk(KERN_ALERT "tx_full = %d, tx_resp_cons = 0x%08x,"
+ " tx_req_prod = 0x%08x, tx_resp_prod = 0x%08x,"
+ " tx_event = 0x%08x, state=%d\n",
+ np->tx_full, np->tx_resp_cons,
np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod,
np->net_idx->tx_event,
test_bit(__LINK_STATE_XOFF, &dev->state));
- printk(KERN_ALERT "rx_resp_cons = %d,"
- " rx_req_prod = %d, rx_resp_prod = %d, rx_event = %d\n",
+ printk(KERN_ALERT "rx_resp_cons = 0x%08x,"
+ " rx_req_prod = 0x%08x, rx_resp_prod = 0x%08x, rx_event = 0x%08x\n",
np->rx_resp_cons, np->net_idx->rx_req_prod,
np->net_idx->rx_resp_prod, np->net_idx->rx_event);
}
np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
memset(&np->stats, 0, sizeof(np->stats));
spin_lock_init(&np->tx_lock);
- atomic_set(&np->tx_entries, 0);
memset(np->net_ring, 0, sizeof(*np->net_ring));
memset(np->net_idx, 0, sizeof(*np->net_idx));
static void network_tx_buf_gc(struct net_device *dev)
{
- unsigned int i;
+ NET_RING_IDX i, prod;
+ unsigned short id;
struct net_private *np = dev->priv;
struct sk_buff *skb;
- unsigned int prod;
tx_entry_t *tx_ring = np->net_ring->tx_ring;
do {
prod = np->net_idx->tx_resp_prod;
- for ( i = np->tx_resp_cons; i != prod; i = TX_RING_INC(i) )
+ for ( i = np->tx_resp_cons; i != prod; i++ )
{
- skb = np->tx_skbs[tx_ring[i].resp.id];
- ADD_ID_TO_FREELIST(np->tx_skbs, tx_ring[i].resp.id);
+ id = tx_ring[MASK_NET_TX_IDX(i)].resp.id;
+ skb = np->tx_skbs[id];
+ ADD_ID_TO_FREELIST(np->tx_skbs, id);
dev_kfree_skb_any(skb);
- atomic_dec(&np->tx_entries);
}
np->tx_resp_cons = prod;
- /* Set a new event, then check for race with update of tx_cons. */
- np->net_idx->tx_event =
- TX_RING_ADD(prod, (atomic_read(&np->tx_entries)>>1) + 1);
+ /*
+ * Set a new event, then check for race with update of tx_cons. Note
+ * that it is essential to schedule a callback, no matter how few
+ * buffers are pending. Even if there is space in the transmit ring,
+ * higher layers may be blocked because too much data is outstanding:
+ * in such cases notification from Xen is likely to be the only kick
+ * that we'll get.
+ */
+ np->net_idx->tx_event =
+ prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1;
mb();
}
while ( prod != np->net_idx->tx_resp_prod );
- if ( np->tx_full && (atomic_read(&np->tx_entries) < TX_MAX_ENTRIES) )
+ if ( np->tx_full && ((np->net_idx->tx_req_prod - prod) < TX_RING_SIZE) )
{
np->tx_full = 0;
if ( np->state == STATE_ACTIVE )
static void network_alloc_rx_buffers(struct net_device *dev)
{
- unsigned int i, id;
+ unsigned short id;
struct net_private *np = dev->priv;
struct sk_buff *skb;
- unsigned int end = RX_RING_ADD(np->rx_resp_cons, RX_MAX_ENTRIES);
netop_t netop;
+ NET_RING_IDX i = np->net_idx->rx_req_prod;
- if ( ((i = np->net_idx->rx_req_prod) == end) ||
- (np->state != STATE_ACTIVE) )
+ if ( unlikely((i - np->rx_resp_cons) == RX_RING_SIZE) ||
+ unlikely(np->state != STATE_ACTIVE) )
return;
do {
skb = dev_alloc_skb(RX_BUF_SIZE);
- if ( skb == NULL ) break;
+ if ( unlikely(skb == NULL) )
+ break;
+
skb->dev = dev;
if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
id = GET_ID_FROM_FREELIST(np->rx_skbs);
np->rx_skbs[id] = skb;
- np->net_ring->rx_ring[i].req.id = (unsigned short)id;
- np->net_ring->rx_ring[i].req.addr =
+ np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id;
+ np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr =
virt_to_machine(get_ppte(skb->head));
np->rx_bufs_to_notify++;
}
- while ( (i = RX_RING_INC(i)) != end );
+ while ( (++i - np->rx_resp_cons) != RX_RING_SIZE );
/*
* We may have allocated buffers which have entries outstanding in the page
flush_page_update_queue();
np->net_idx->rx_req_prod = i;
- np->net_idx->rx_event = RX_RING_INC(np->rx_resp_cons);
+ np->net_idx->rx_event = np->rx_resp_cons + 1;
/* Batch Xen notifications. */
- if ( np->rx_bufs_to_notify > (RX_MAX_ENTRIES/4) )
+ if ( np->rx_bufs_to_notify > (RX_RING_SIZE/4) )
{
netop.cmd = NETOP_PUSH_BUFFERS;
netop.vif = np->idx;
static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
- unsigned int i, id;
+ unsigned short id;
struct net_private *np = (struct net_private *)dev->priv;
+ tx_req_entry_t *tx;
netop_t netop;
+ NET_RING_IDX i;
- if ( np->tx_full )
+ if ( unlikely(np->tx_full) )
{
printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
netif_stop_queue(dev);
return -ENOBUFS;
}
- if ( (((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= PAGE_SIZE )
+ if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
+ PAGE_SIZE) )
{
struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
- if ( new_skb == NULL ) return 1;
+ if ( unlikely(new_skb == NULL) )
+ return 1;
skb_put(new_skb, skb->len);
memcpy(new_skb->data, skb->data, skb->len);
dev_kfree_skb(skb);
id = GET_ID_FROM_FREELIST(np->tx_skbs);
np->tx_skbs[id] = skb;
- np->net_ring->tx_ring[i].req.id = (unsigned short)id;
- np->net_ring->tx_ring[i].req.addr =
- phys_to_machine(virt_to_phys(skb->data));
- np->net_ring->tx_ring[i].req.size = skb->len;
- np->net_idx->tx_req_prod = TX_RING_INC(i);
- atomic_inc(&np->tx_entries);
+ tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
+
+ tx->id = id;
+ tx->addr = phys_to_machine(virt_to_phys(skb->data));
+ tx->size = skb->len;
+
+ wmb();
+ np->net_idx->tx_req_prod = i + 1;
network_tx_buf_gc(dev);
- if ( atomic_read(&np->tx_entries) >= TX_MAX_ENTRIES )
+ if ( (i - np->tx_resp_cons) == TX_RING_SIZE )
{
np->tx_full = 1;
netif_stop_queue(dev);
static inline void _network_interrupt(struct net_device *dev)
{
struct net_private *np = dev->priv;
- unsigned int i;
unsigned long flags;
struct sk_buff *skb;
rx_resp_entry_t *rx;
+ NET_RING_IDX i;
- if ( np->state == STATE_CLOSED )
+ if ( unlikely(np->state == STATE_CLOSED) )
return;
spin_lock_irqsave(&np->tx_lock, flags);
spin_unlock_irqrestore(&np->tx_lock, flags);
again:
- for ( i = np->rx_resp_cons;
- i != np->net_idx->rx_resp_prod;
- i = RX_RING_INC(i) )
+ for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
{
- rx = &np->net_ring->rx_ring[i].resp;
+ rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
skb = np->rx_skbs[rx->id];
ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
- if ( rx->status != RING_STATUS_OK )
+ if ( unlikely(rx->status != RING_STATUS_OK) )
{
/* Gate this error. We get a (valid) slew of them on suspend. */
if ( np->state == STATE_ACTIVE )
/* Deal with hypervisor racing our resetting of rx_event. */
mb();
- if ( np->net_idx->rx_resp_prod != i ) goto again;
+ if ( np->net_idx->rx_resp_prod != i )
+ goto again;
}